import pandas as pd
import re
import jieba


# Runs of ASCII letters, digits and the symbols that appear in technology
# names such as "c++", "c#", "node.js" or "scikit-learn".
_EN_TERM_RE = re.compile(r'[a-zA-Z0-9+#.-]+')
# Any CJK ideograph (Extension A plus the basic Unified Ideographs block).
# str.isalpha() cannot be used for this: it is True for CJK characters too.
_CJK_RE = re.compile(r'[\u3400-\u4dbf\u4e00-\u9fff]')
# Separators accepted in the skill dictionary: ASCII comma, full-width comma
# and newline.
_SKILL_SEP_RE = re.compile(r'[,，\n]')


def _load_skills(path):
    """Read the skill dictionary at *path* and return a set of lowercase skills.

    Entries are separated by commas (ASCII or full-width) or newlines; blank
    entries are ignored. Every skill that contains a CJK character is also
    registered with jieba so the segmenter keeps it as a single token.
    """
    with open(path, 'r', encoding='utf-8') as f:
        raw = f.read()

    skill_set = set()
    for entry in _SKILL_SEP_RE.split(raw):
        skill = entry.strip().lower()
        if not skill:
            # Trailing separators or an empty file would otherwise put ''
            # into the set and into jieba's dictionary.
            continue
        skill_set.add(skill)
        if _CJK_RE.search(skill):
            jieba.add_word(skill)
    return skill_set


def _extract_skills(text, skill_set):
    """Return the skills from *skill_set* mentioned in *text*.

    The result is de-duplicated and ordered deterministically: matches from
    the English tokens come first, then matches from the Chinese tokens,
    each in order of appearance. Missing values (None / NaN) yield an empty
    list.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            # Without this, a missing cell would be processed as the
            # literal text "nan".
            return []
        text = str(text)

    # English technical terms are pulled out with the regex ...
    en_terms = _EN_TERM_RE.findall(text)
    # ... and the remainder is segmented by jieba. English tokens are replaced
    # with a space (not deleted) so the Chinese fragments on either side are
    # not glued together into a word that never appeared in the text.
    chn_terms = jieba.lcut(_EN_TERM_RE.sub(' ', text))

    found = {}  # dict keeps insertion order, so output is stable across runs
    for term in en_terms + chn_terms:
        term = term.lower()
        if term not in skill_set:
            # Retry without adjacent punctuation, e.g. "python." at the end of
            # a sentence or "-java" in a bullet list. The exact token is tried
            # first so that names like ".net" still match as written.
            term = term.strip('-').rstrip('.')
        if term in skill_set:
            found[term] = None
    return list(found)


def main(skill_path='skill.txt',
         data_path='拉勾网2023招聘数据.csv',
         output_path='processed_招聘数据.csv'):
    """Tag every job posting with the dictionary skills found in its text.

    Reads the skill dictionary from *skill_path* and the postings from
    *data_path*, adds a ``skill_list`` column computed from the
    ``positionDetail`` column, and writes the result to *output_path*.
    The defaults reproduce the original hard-coded file names.
    """
    # Load the skill dictionary (also registers Chinese skills with jieba).
    skill_set = _load_skills(skill_path)

    # Load the job postings.
    df = pd.read_csv(data_path, encoding='utf-8')

    # Extract the skills per posting into a new column.
    df['skill_list'] = [
        _extract_skills(text, skill_set) for text in df['positionDetail']
    ]

    # Save the result; utf-8-sig writes a BOM at the start of the file.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()